Data is taken from the Stack Overflow Annual Developer Survey (2018-2021). The survey data will be reviewed in two aspects:
To compare income of developers & data experts across countries, and factoring in the effect of tax and purchasing power.
To evaluate the effect of years of professional coding experience on their income, after adjusting for usage of Python programming and education level.
For our data science project, we activated the following packages. We may directly call functions in other packages with :: to avoid namespace collisions, given that data used in this project is not huge, there should be no concerns about the efficiency.
# Load necessary packages
pacman::p_load(tidyverse, lubridate, modelr, broom, plotly, CGPfunctions, equatiomatic, rvest )
# Setting working directory, seed & options
# NOTE(review): hard-coded absolute setwd() makes this script non-portable —
# consider an RStudio project or here::here() instead.
setwd('C:/SMU Study/R Programming/DASR/Capstone Project/Submission/')
set.seed(20211031)
options(scipen = 9999) # avoid scientific notation in printed numbers
# Save the current warning level so it can be restored later with options(warn = defaultW)
defaultW <- getOption("warn")
# NOTE(review): warn = -1 silences ALL warnings globally for the rest of the
# session — this can hide real data problems; scope it narrowly if possible.
options(warn = -1)
options(tibble.width = Inf) # displays all columns.
options(tibble.print_max = Inf) # to show all the rows.
options(dplyr.summarise.inform = FALSE) # Suppress summarise infoWe imported the results of survey conducted from 2018 to 2021.
# data18=read_csv("C:/SMU Study/R Programming/DASR/Capstone Project/Data/Data Scientist Earn/2018/survey_results_public.csv")
# data19=read_csv("C:/SMU Study/R Programming/DASR/Capstone Project/Data/Data Scientist Earn/2019/survey_results_public.csv")
# data20=read_csv("C:/SMU Study/R Programming/DASR/Capstone Project/Data/Data Scientist Earn/2020/survey_results_public.csv")
# data21=read_csv("C:/SMU Study/R Programming/DASR/Capstone Project/Data/Data Scientist Earn/2021/survey_results_public.csv")# Save raw dataset files in .RDS format
# saveRDS(data18, file = "LEE_data18.rds")
# saveRDS(data19, file = "LEE_data19.rds")
# saveRDS(data20, file = "LEE_data20.rds")
# saveRDS(data21, file = "LEE_data21.rds")
data18 <- readRDS("LEE_data18.rds")
data19 <- readRDS("LEE_data19.rds")
data20 <- readRDS("LEE_data20.rds")
data21 <- readRDS("LEE_data21.rds")Data categories & variable names used in each survey year may be slightly different, we will align the values and rename the variables.
Within the datasets, there are Converted Annual Compensation in US dollar (to be named as ConvertedCompYearly), which will serve as our dependent (continuous y) variable.
Also, the dataset contains various attributes of the respondents, such as Country, Years of Professional Coding (YearsCodingProf), Gender, Education Level (EdLevel) etc.
In Problem 1, Country will be our independent variable (categorical x1), and it will show the significant difference of income among countries.
Hence in Problem 2, we will only focus on the country of United States, which has the most respondents in this survey. YearsCodingProf will serve as our independent variable (continuous x1); usage of Python programming (fPython, derived from LanguageWorkedWith) and EdLevel will be applied one by one as our moderator (categorical x2).
# glimpse(data18)
# glimpse(data19)
# glimpse(data20)
glimpse(data21)## Rows: 83,439
## Columns: 48
## $ ResponseId <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13~
## $ MainBranch <chr> "I am a developer by profession", "I am a~
## $ Employment <chr> "Independent contractor, freelancer, or s~
## $ Country <chr> "Slovakia", "Netherlands", "Russian Feder~
## $ US_State <chr> NA, NA, NA, NA, NA, "Georgia", "New Hamps~
## $ UK_Country <chr> NA, NA, NA, NA, "England", NA, NA, NA, NA~
## $ EdLevel <chr> "Secondary school (e.g. American high sch~
## $ Age1stCode <chr> "18 - 24 years", "11 - 17 years", "11 - 1~
## $ LearnCode <chr> "Coding Bootcamp;Other online resources (~
## $ YearsCode <chr> NA, "7", NA, NA, "17", NA, "3", "4", "6",~
## $ YearsCodePro <chr> NA, NA, NA, NA, "10", NA, NA, NA, "4", "4~
## $ DevType <chr> "Developer, mobile", NA, NA, "Developer, ~
## $ OrgSize <chr> "20 to 99 employees", NA, NA, "100 to 499~
## $ Currency <chr> "EUR European Euro", NA, NA, "EUR Europea~
## $ CompTotal <dbl> 4800, NA, NA, NA, NA, NA, NA, NA, NA, 420~
## $ CompFreq <chr> "Monthly", NA, NA, "Monthly", NA, NA, NA,~
## $ LanguageHaveWorkedWith <chr> "C++;HTML/CSS;JavaScript;Objective-C;PHP;~
## $ LanguageWantToWorkWith <chr> "Swift", NA, "Julia;Python;Rust", "JavaSc~
## $ DatabaseHaveWorkedWith <chr> "PostgreSQL;SQLite", "PostgreSQL", "SQLit~
## $ DatabaseWantToWorkWith <chr> "SQLite", NA, "SQLite", NA, "Cassandra;El~
## $ PlatformHaveWorkedWith <chr> NA, NA, "Heroku", NA, NA, NA, NA, "Heroku~
## $ PlatformWantToWorkWith <chr> NA, NA, NA, NA, NA, NA, NA, "Heroku", NA,~
## $ WebframeHaveWorkedWith <chr> "Laravel;Symfony", "Angular;Flask;Vue.js"~
## $ WebframeWantToWorkWith <chr> NA, NA, "Flask", "Angular;jQuery", "Flask~
## $ MiscTechHaveWorkedWith <chr> NA, "Cordova", "NumPy;Pandas;TensorFlow;T~
## $ MiscTechWantToWorkWith <chr> NA, NA, "Keras;NumPy;Pandas;TensorFlow;To~
## $ ToolsTechHaveWorkedWith <chr> NA, "Docker;Git;Yarn", NA, NA, "Docker;Gi~
## $ ToolsTechWantToWorkWith <chr> NA, "Git", NA, NA, "Docker;Git;Kubernetes~
## $ NEWCollabToolsHaveWorkedWith <chr> "PHPStorm;Xcode", "Android Studio;Intelli~
## $ NEWCollabToolsWantToWorkWith <chr> "Atom;Xcode", NA, "IPython/Jupyter;RStudi~
## $ OpSys <chr> "MacOS", "Windows", "MacOS", "Windows", "~
## $ NEWStuck <chr> "Call a coworker or friend;Visit Stack Ov~
## $ NEWSOSites <chr> "Stack Overflow", "Stack Overflow", "Stac~
## $ SOVisitFreq <chr> "Multiple times per day", "Daily or almos~
## $ SOAccount <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes",~
## $ SOPartFreq <chr> "A few times per month or weekly", "Daily~
## $ SOComm <chr> "Yes, definitely", "Yes, definitely", "Ye~
## $ NEWOtherComms <chr> "No", "No", "Yes", "No", "No", "No", "No"~
## $ Age <chr> "25-34 years old", "18-24 years old", "18~
## $ Gender <chr> "Man", "Man", "Man", "Man", "Man", "Prefe~
## $ Trans <chr> "No", "No", "No", "No", "No", "No", "No",~
## $ Sexuality <chr> "Straight / Heterosexual", "Straight / He~
## $ Ethnicity <chr> "White or of European descent", "White or~
## $ Accessibility <chr> "None of the above", "None of the above",~
## $ MentalHealth <chr> "None of the above", "None of the above",~
## $ SurveyLength <chr> "Appropriate in length", "Appropriate in ~
## $ SurveyEase <chr> "Easy", "Easy", "Easy", "Neither easy nor~
## $ ConvertedCompYearly <dbl> 62268, NA, NA, NA, NA, NA, NA, NA, NA, 51~
A few points have to be catered in the step:
# Harmonise the 2018 survey to the common schema used by the later years:
#  - annualise ConvertedSalary into ConvertedCompYearly from SalaryType
#    (any other / missing frequency becomes NA),
#  - turn YearsCodingProf into an ordered-level factor,
#  - stamp MainBranch (absent in 2018) with the "Z.Missing" sentinel,
#  - rename 2018-specific columns to the 2019+ names and drop Age
#    (coded differently across years).
data18a <- data18 %>%
  mutate(
    year = 2018,
    ConvertedCompYearly = case_when(
      SalaryType == "Yearly"  ~ ConvertedSalary,
      SalaryType == "Monthly" ~ ConvertedSalary * 12,
      SalaryType == "Weekly"  ~ ConvertedSalary * 52,
      TRUE                    ~ NA_real_
    ),
    YearsCodingProf = factor(
      YearsCodingProf,
      levels = c("0-2 years", "3-5 years", "6-8 years", "9-11 years",
                 "12-14 years", "15-17 years", "18-20 years", "21-23 years",
                 "24-26 years", "27-29 years", "30 or more years")
    ),
    MainBranch = "Z.Missing"
  ) %>%
  rename(
    Hobbyist = Hobby,
    EdLevel = FormalEducation, Ethnicity = RaceEthnicity,
    JobSat = JobSatisfaction, OpSys = OperatingSystem,
    OrgSize = CompanySize
  ) %>%
  select(-Age)
# Harmonise the 2019 survey to the common schema.
data19a <- data19 %>%
# Annualise ConvertedComp from its reported frequency; anything else becomes NA
mutate(year=2019,
ConvertedCompYearly=ifelse(CompFreq=="Yearly", ConvertedComp,
ifelse(CompFreq=="Monthly", ConvertedComp*12,
ifelse(CompFreq=="Weekly", ConvertedComp*52, NA) ) )
) %>%
# YearsCodePro is character in 2019: map the two verbal answers to sentinels
# 0 / 99, convert to numeric (kept as nCodeExp), then bin into the same 3-year
# bands 2018 used. breaks = c(-1, 2, 5, 8, ..., 29, 99) give 11 intervals,
# matching the 11 labels; the 99 sentinel lands in "30 or more years".
mutate(YearsCodingProf = ifelse(YearsCodePro=="Less than 1 year", 0,
ifelse(YearsCodePro=="More than 50 years",99,
YearsCodePro) ),
nCodeExp=as.numeric(YearsCodingProf),
YearsCodingProf = cut(nCodeExp, breaks=c(-1, seq(2,30, 3) , 99),
labels=c("0-2 years", "3-5 years", "6-8 years", "9-11 years",
"12-14 years", "15-17 years", "18-20 years", "21-23 years",
"24-26 years", "27-29 years", "30 or more years") )
) %>%
# Age is coded differently across survey years, so it is dropped
select(-Age)
# levels(data19$YearsCodingProf)
# Harmonise the 2020 survey to the common schema
# (same compensation annualisation and experience binning as 2019).
data20a <- data20 %>%
mutate(year=2020,
ConvertedCompYearly=ifelse(CompFreq=="Yearly", ConvertedComp,
ifelse(CompFreq=="Monthly", ConvertedComp*12,
ifelse(CompFreq=="Weekly", ConvertedComp*52, NA) ) )
) %>%
# Map verbal experience answers to 0 / 99 sentinels, then bin into the
# same 3-year bands as the other years (99 falls in "30 or more years")
mutate(YearsCodingProf = ifelse(YearsCodePro=="Less than 1 year", 0,
ifelse(YearsCodePro=="More than 50 years",99,
YearsCodePro) ),
nCodeExp=as.numeric(YearsCodingProf),
YearsCodingProf = cut(nCodeExp, breaks=c(-1, seq(2,30, 3) , 99),
labels=c("0-2 years", "3-5 years", "6-8 years", "9-11 years",
"12-14 years", "15-17 years", "18-20 years", "21-23 years",
"24-26 years", "27-29 years", "30 or more years") )
) %>%
# "Z.Missing" sentinel lets these rows pass the Student filter applied later
mutate(Student="Z.Missing") %>%
select(-Age)
# Harmonise the 2021 survey to the common schema
# (ConvertedCompYearly already exists in the 2021 data, so only the
# experience binning, sentinels and renames are needed here).
data21a <- data21 %>%
mutate(year=2021, YearsCodingProf = ifelse(YearsCodePro=="Less than 1 year", 0,
ifelse(YearsCodePro=="More than 50 years",99,
YearsCodePro) ),
nCodeExp=as.numeric(YearsCodingProf),
YearsCodingProf = cut(nCodeExp, breaks=c(-1, seq(2,30, 3) , 99),
labels=c("0-2 years", "3-5 years", "6-8 years", "9-11 years",
"12-14 years", "15-17 years", "18-20 years", "21-23 years",
"24-26 years", "27-29 years", "30 or more years") )
) %>%
# First 3 characters of Currency hold its code (e.g. "EUR European Euro" -> "EUR")
mutate(CurrencySymbol = substr(Currency, 1, 3) ) %>%
# "Z.Missing" sentinel lets these rows pass the Student filter applied later
mutate(Student="Z.Missing") %>%
# Rename 2021 columns to the names used in the earlier survey years
rename(LanguageWorkedWith=LanguageHaveWorkedWith,
DatabaseWorkedWith=DatabaseHaveWorkedWith,
Respondent=ResponseId) %>%
select(-Age)The first problem is to compare the income across countries, however, if countries do not have sufficient number of respondents, the analysis result would be inconclusive. Hence it is proposed to focus on the top 10 countries with most respondents.
# Pool all four survey years, keep only rows with a positive, non-missing
# compensation, align the long 2021 country names with earlier years, and
# count respondents per country in descending order.
ho <-
bind_rows(data18a %>%
select(ConvertedCompYearly, Country),
data19a %>%
select(ConvertedCompYearly, Country),
data20a %>%
select(ConvertedCompYearly, Country),
data21a %>%
select(ConvertedCompYearly, Country)
) %>%
filter(!is.na(ConvertedCompYearly) & ConvertedCompYearly>0 ) %>%
# 2021 uses the long official country names; map them to the short forms
mutate(Country=ifelse(Country=="United States of America", "United States", Country)) %>%
mutate(Country=ifelse(Country=="United Kingdom of Great Britain and Northern Ireland", "United Kingdom", Country)) %>%
group_by(Country) %>%
summarise(N=n()) %>%
ungroup() %>%
arrange(desc(N) )
# The 10 countries with the most respondents (ho is already sorted by N, descending)
Top10_Country <- ho$Country[1:10]
Top10_Country## [1] "United States" "India" "United Kingdom" "Germany"
## [5] "Canada" "France" "Brazil" "Poland"
## [9] "Netherlands" "Australia"
This information will be used to derive indicators in subsequent steps.
hoTemp <-
bind_rows(data18a %>%
select(ConvertedCompYearly, Country, DevType, LanguageWorkedWith, DatabaseWorkedWith),
data19a %>%
select(ConvertedCompYearly, Country, DevType, LanguageWorkedWith, DatabaseWorkedWith),
data20a %>%
select(ConvertedCompYearly, Country, DevType, LanguageWorkedWith, DatabaseWorkedWith),
data21a %>%
select(ConvertedCompYearly, Country, DevType, LanguageWorkedWith, DatabaseWorkedWith)
) %>%
filter(!is.na(ConvertedCompYearly) & ConvertedCompYearly>0)
ho <- hoTemp %>%
select(DevType) %>%
separate(col=DevType,
paste0("v", str_pad( as.character( seq(1,40,1)), 2, pad="0" ) ),
sep = ";")
table(ho$v40) # To ensure all BLANK## < table of extent 0 >
AllDevType <- ho %>%
pivot_longer(cols=paste0("v", str_pad( as.character( seq(1,40,1)), 2, pad="0" ) ),
names_to = "vv",
values_to = "role") %>%
filter(!is.na(role)) %>%
group_by(role) %>%
summarise(NN=n()) %>%
arrange(desc(NN))
head(AllDevType, 25)## # A tibble: 25 x 2
## role NN
## <chr> <int>
## 1 Developer, full-stack 74524
## 2 Developer, back-end 69658
## 3 Developer, front-end 44047
## 4 Developer, desktop or enterprise applications 28142
## 5 Back-end developer 25008
## 6 Developer, mobile 22522
## 7 DevOps specialist 22164
## 8 Full-stack developer 21701
## 9 Database administrator 20753
## 10 System administrator 18676
## 11 Designer 16170
## 12 Front-end developer 15973
## 13 Data scientist or machine learning specialist 12926
## 14 Data or business analyst 12741
## 15 Developer, embedded applications or devices 11228
## 16 Engineering manager 10179
## 17 Developer, QA or test 9975
## 18 Engineer, data 9637
## 19 Student 9388
## 20 Product manager 8195
## 21 Mobile developer 8034
## 22 Desktop or enterprise applications developer 7536
## 23 Academic researcher 7258
## 24 Educator 6116
## 25 Engineer, site reliability 5454
ho <- hoTemp %>%
select(LanguageWorkedWith) %>%
separate(col=LanguageWorkedWith,
paste0("v", str_pad( as.character( seq(1,40,1)), 2, pad="0" ) ),
sep = ";")
table(ho$v40) # To ensure all BLANK## < table of extent 0 >
AllLang <- ho %>%
pivot_longer(cols=paste0("v", str_pad( as.character( seq(1,40,1)), 2, pad="0" ) ),
names_to = "vv",
values_to = "role") %>%
filter(!is.na(role)) %>%
group_by(role) %>%
summarise(NN=n()) %>%
arrange(desc(NN))
head(AllLang, 25)## # A tibble: 25 x 2
## role NN
## <chr> <int>
## 1 JavaScript 122961
## 2 SQL 98539
## 3 HTML/CSS 81569
## 4 Python 71137
## 5 Java 65662
## 6 C# 57074
## 7 TypeScript 47955
## 8 PHP 44112
## 9 C++ 34486
## 10 Bash/Shell/PowerShell 34250
## 11 Bash/Shell 30952
## 12 C 28988
## 13 HTML 27672
## 14 CSS 26477
## 15 Go 16483
## 16 Node.js 16443
## 17 Ruby 16095
## 18 Kotlin 12111
## 19 Swift 11379
## 20 R 9303
## 21 VBA 9026
## 22 Objective-C 8518
## 23 Assembly 7824
## 24 Rust 7134
## 25 Scala 7010
ho <- hoTemp %>%
select(DatabaseWorkedWith) %>%
separate(col=DatabaseWorkedWith,
paste0("v", str_pad( as.character( seq(1,40,1)), 2, pad="0" ) ),
sep = ";")
table(ho$v40) # To ensure all BLANK## < table of extent 0 >
AllDB <- ho %>%
pivot_longer(cols=paste0("v", str_pad( as.character( seq(1,40,1)), 2, pad="0" ) ),
names_to = "vv",
values_to = "role") %>%
filter(!is.na(role)) %>%
group_by(role) %>%
summarise(NN=n()) %>%
arrange(desc(NN))
head(AllDB, 25)## # A tibble: 25 x 2
## role NN
## <chr> <int>
## 1 MySQL 78695
## 2 PostgreSQL 60830
## 3 SQLite 43837
## 4 MongoDB 40784
## 5 Microsoft SQL Server 40381
## 6 Redis 34306
## 7 Elasticsearch 25564
## 8 MariaDB 24937
## 9 Oracle 21966
## 10 Firebase 15575
## 11 SQL Server 14470
## 12 DynamoDB 9759
## 13 Cassandra 5320
## 14 Other(s): 3625
## 15 Microsoft Azure (Tables, CosmosDB, SQL, etc) 2926
## 16 Couchbase 2353
## 17 Memcached 2130
## 18 Amazon DynamoDB 1991
## 19 Amazon RDS/Aurora 1979
## 20 Google Cloud Storage 1830
## 21 IBM DB2 1797
## 22 IBM Db2 895
## 23 Neo4j 841
## 24 Amazon Redshift 814
## 25 Google BigQuery 748
Country, Organization Size (OrgSize),EdLevel), Gender, operating system (OpSys) etc.;rawdata <- bind_rows(data18a, data19a, data20a, data21a)
# Build the analysis base from the pooled four-year data:
#  - harmonise country names and drop unused question blocks,
#  - keep employed (non-student) professional developer / data-role
#    respondents from the 10 countries with the most respondents,
#  - standardise OrgSize wording and ordering across years,
#  - derive role (fData/fDeve) and language (f*) indicator flags.
rawdata1 <- rawdata %>%
  mutate(Country=ifelse(Country=="United States of America", "United States", Country)) %>%
  mutate(Country=ifelse(Country=="United Kingdom of Great Britain and Northern Ireland", "United Kingdom", Country)) %>%
  # Drop question blocks not used anywhere in this analysis
  select(-starts_with("Assess"), -starts_with("Ads"), -starts_with("JobContact"),
         -starts_with("JobEmail"), -starts_with("HypotheticalTools")
  ) %>%
  filter(!is.na(ConvertedCompYearly) & ConvertedCompYearly>0) %>%
  filter(!is.na(YearsCodingProf) ) %>%
  filter(!is.na(EdLevel) ) %>%
  filter(Country %in% Top10_Country) %>%
  # "Z.Missing" sentinels keep the years in which a question was not asked
  filter(Employment %in% c("Employed full-time", "Independent contractor, freelancer, or self-employed")
         & Student %in% c("No", "Yes, part-time", "Z.Missing") ) %>%
  filter(MainBranch %in% c("I am a developer by profession", "Z.Missing") ) %>%
  filter(grepl("Data scientist",DevType) |
           grepl("Data or business analyst",DevType) |
           grepl("Engineer, data",DevType) |
           grepl("Engineer, site reliability",DevType) |
           grepl("Scientist",DevType) |
           grepl("Developer",DevType) |
           grepl("developer",DevType)
  ) %>%
  mutate(year=as_factor(year) ) %>%
  # Align OrgSize wording across survey years, then order its levels
  mutate(OrgSize=str_replace(OrgSize,"2-9","Fewer than 10") ) %>%
  mutate(OrgSize=str_replace(OrgSize,"2 to 9","Fewer than 10") ) %>%
  mutate(OrgSize=str_replace(OrgSize,"Just me - I am a freelancer, sole proprietor, etc.", "Fewer than 10 employees") ) %>%
  mutate(OrgSize=str_replace(OrgSize,"I don't know","Unknown") ) %>%
  mutate(OrgSize=replace_na(OrgSize,"Unknown") ) %>%
  mutate(OrgSize=factor(OrgSize,
                        levels=c("Fewer than 10 employees",
                                 "10 to 19 employees",
                                 "20 to 99 employees",
                                 "100 to 499 employees",
                                 "500 to 999 employees",
                                 "1,000 to 4,999 employees",
                                 "5,000 to 9,999 employees",
                                 "10,000 or more employees",
                                 "Unknown"))
  ) %>%
  # Role flags: data-related roles vs developer roles
  mutate(fData=ifelse(grepl("Data scientist",DevType) |
                        grepl("Data or business analyst",DevType) |
                        grepl("Engineer, data",DevType) |
                        grepl("Engineer, site reliability",DevType) |
                        grepl("Scientist",DevType), 1, 0) ) %>%
  mutate(fDeve=ifelse(grepl("Developer",DevType) |
                        grepl("developer",DevType), 1, 0) ) %>%
  filter(fData==1 | fDeve==1 ) %>%
  # Language flags. LanguageWorkedWith is a ";"-separated list with NO trailing
  # ";", so the previous patterns "Java;", "R;", "C;", "C#;", "C++;" silently
  # missed respondents whose LAST listed language was Java / R / C / C# / C++.
  # Fix: anchor each language between list delimiters — start-of-string or ";"
  # before, and ";" or end-of-string after. This still excludes near-miss names
  # (e.g. "JavaScript" for Java, "Ruby" for R) exactly as before.
  mutate(fJavaScript=ifelse(grepl("JavaScript",LanguageWorkedWith), 1, 0) ) %>%
  mutate(fSQL=ifelse(grepl("SQL",LanguageWorkedWith), 1, 0) ) %>%
  mutate(fPython=ifelse(grepl("Python",LanguageWorkedWith), 1, 0) ) %>%
  mutate(fJava=ifelse(grepl("(^|;)Java(;|$)",LanguageWorkedWith), 1, 0) ) %>%
  mutate(fRlang=ifelse(grepl("(^|;)R(;|$)",LanguageWorkedWith), 1, 0) ) %>%
  mutate(fClang=ifelse(grepl("(^|;)C#(;|$)",LanguageWorkedWith) |
                         grepl("(^|;)C\\+\\+(;|$)",LanguageWorkedWith) |
                         grepl("(^|;)C(;|$)",LanguageWorkedWith) , 1, 0) )
# Grouping EdLevel
# table(rawdata1$EdLevel)
# Collapse the 14 education-level wordings used across survey years into 5
# ordered bands. factor() with duplicated labels merges levels: positions
# 1-4 -> BelowCollege, 5-7 -> College, 8-9 -> Degree (two Bachelor wordings),
# 10-11 -> Master (two wordings), 12-14 -> Doctor (two Ph.D. wordings plus
# Professional degree). Any EdLevel string not listed becomes NA.
rawdata2 <- rawdata1 %>%
mutate(xEdu=factor(EdLevel,
levels=c("Primary/elementary school",
"I never completed any formal education",
"Something else",
"Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",
"Associate degree",
"Associate degree (A.A., A.S., etc.)",
"Some college/university study without earning a degree",
"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",
"Bachelor’s degree (BA, BS, B.Eng., etc.)",
"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",
"Master’s degree (MA, MS, M.Eng., MBA, etc.)",
"Other doctoral degree (Ph.D, Ed.D., etc.)",
"Other doctoral degree (Ph.D., Ed.D., etc.)",
"Professional degree (JD, MD, etc.)" ),
labels=c(rep("1. BelowCollege", 4),
rep("2. College",3),
rep("3. Degree",2),
rep("4. Master",2),
rep("5. Doctor",3) )
)
)
# table(rawdata2$xEdu)
# Grouping Gender & OpSys
# table(rawdata2$Gender); table(rawdata2$OpSys);
# Collapse Gender and OpSys into coarse factors. Note: factor() turns every
# value NOT in the listed levels into NA, so any other answer (multi-select
# combinations, other OS strings, etc.) is lumped into "Others" by the
# explicit-NA step below, together with genuinely missing answers.
rawdata3 <- rawdata2 %>%
mutate(xSex=factor(Gender,
levels=c("Man",
"Male",
"Man;Or, in your own words:",
"Woman",
"Female",
"Woman;Or, in your own words:"
),
labels=c(rep("Male", 3),
rep("Female",3)
)
)
) %>%
# NOTE(review): fct_explicit_na() is deprecated in forcats >= 1.0 in favour of
# fct_na_value_to_level() — update if the forcats version is bumped.
mutate(xSex=forcats::fct_explicit_na(xSex, "Others") ) %>%
mutate(xOS=factor(OpSys,
levels=c("Windows", "MacOS", "Linux-based" ),
labels=c("Windows", "MacOS", "Linux-based" )
)
) %>%
mutate(xOS=forcats::fct_explicit_na(xOS, "Others") )
table(rawdata3$xOS)##
## Windows MacOS Linux-based Others
## 38418 30908 20460 2300
rawdata4 <- rawdata3 %>%
mutate(Country = forcats::fct_relevel( as_factor(Country), "United States" ) ) %>%
select(year, Country, CurrencySymbol, ConvertedCompYearly,
OrgSize, YearsCodingProf, xEdu, xSex, xOS,
fDeve, fData, fJavaScript, fSQL, fPython, fJava, fRlang, fClang,
DevType, EdLevel, Gender, OpSys, Ethnicity,
Employment, Student,
LanguageWorkedWith, DatabaseWorkedWith, Respondent, nCodeExp)Noted that income showed increase in an exponential curve, a number of respondents declared their annual income over USD1M. As data is collected from online voluntary surveys, some respondents might mis-understand the questions and mis-fill the answers. Distributions of coding experience between those with income below & above USD1M are similar, majority has less than 8 years professional coding experience. Even if the exceptionally high income is accurate, it might not be due to their coding experience or education level. Therefore, respondents with income above USD1M will be excluded from this analysis.
rawdata4 %>%
ggplot(aes(x=Country, y=ConvertedCompYearly)) +
geom_boxplot(notch = T) +
stat_summary(fun.y=mean, geom="point", shape=20, size=5, color="red", fill="red") +
labs(title="Boxplot Compensation: Too Many Exceptionally High Income",
subtitle=" - Annual Income as high as USD100M ?")ho <- rawdata4 %>%
arrange(ConvertedCompYearly)
ho$NN <- zoo::index(ho)
ho_interactive <- ho %>%
plot_ly(x = ~NN,
y = ~ConvertedCompYearly,
type="scatter",
mode='lines')
htmlwidgets::saveWidget(as_widget(ho_interactive), "index.html")
ho_interactiveho %>%
ggplot(aes(x=NN, y = ConvertedCompYearly ) ) +
scale_y_continuous(limits= c(0, 25e6) ) +
geom_hline(yintercept = 1e6, color = "red") +
annotate("text", x = 75000, y = 3e6,
label = "paste(italic(\"Income \n over $1M\") )",
color = "red", size = 4, parse = T) +
annotate("rect",
fill = "red", alpha = 0.1,
xmin = 85000, xmax = 100000,
ymin = 1e6, ymax = 25e6) +
geom_line(size = 2)Chart_HiInc <- ho %>%
filter(ConvertedCompYearly>=1e6) %>%
ggplot(aes(x=YearsCodingProf ) ) +
geom_bar(aes(y=(..count..)/sum(..count..) ) ) +
scale_y_continuous(limits=c(0,0.3)) +
ylab("proportions") +
labs(title="Income>=$1M" )+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90),
text = element_text(family = "Courier") )
Chart_LoInc <- ho %>%
filter(ConvertedCompYearly<1e6) %>%
ggplot(aes(x=YearsCodingProf ) ) +
geom_bar(aes(y=(..count..)/sum(..count..) ) ) +
scale_y_continuous(limits=c(0,0.3)) +
ylab("proportions") +
labs(title="Income < $1M")+
theme(legend.position = "none",
axis.text.x = element_text(angle = 90),
text = element_text(family = "Courier") )
gridExtra::grid.arrange(Chart_LoInc, Chart_HiInc, ncol=2,
top = paste0("Extremely High Income Earners has similar",
"Coding Experience to Others"),
bottom = paste0("Even if high income is accurate, ",
"it might not be due to their coding experience")
)rawdata5 <- rawdata4 %>%
filter(ConvertedCompYearly<1e6)Even thought respondents with exceptionally high income (>USD1M) are excluded, it is observed that there are still a lot of outliers from boxplot. To remove those outliers, respondents with income within 15 to 85 percentile will be selected for further analysis.
# Boxplot of annual compensation by country before percentile trimming;
# the red point marks each country's mean.
rawdata5 %>%
  ggplot(aes(x = Country, y = ConvertedCompYearly)) +
  geom_boxplot(notch = TRUE) +
  # `fun =` replaces the deprecated `fun.y =` (ggplot2 >= 3.3), matching the
  # stat_summary() calls used later in this script.
  stat_summary(fun = mean, geom = "point", shape = 20, size = 5, color = "red", fill = "red") +
  labs(title = "Boxplot Compensation: A lot of Outliers",
       subtitle = " - To remove records <15 or >85 percentile")
# plot( density(rawdata4$ConvertedCompYearly, na.rm=T) )
# Replace values outside a quantile band with NA.
#
# x     : numeric vector
# na.rm : passed to quantile(); NAs already in x are preserved in the output
# probs : lower/upper quantile cut-offs; default keeps the 15th-85th
#         percentile band (generalised from the previous hard-coded values)
# ...   : forwarded to quantile() (e.g. type =)
#
# Returns x (same length) with out-of-band values set to NA.
# Note: the original computed H <- 0 * IQR(x), which is always 0 — the IQR
# call was dead code and has been removed; the cut-offs are unchanged.
remove_outliers <- function(x, na.rm = TRUE, probs = c(0.15, 0.85), ...) {
  qnt <- quantile(x, probs = probs, na.rm = na.rm, ...)
  # Flag values strictly below the lower or above the upper cut-off;
  # existing NAs are left untouched (never flagged).
  out_of_band <- !is.na(x) & (x < qnt[1] | x > qnt[2])
  x[out_of_band] <- NA
  x
}
rawdata5$Comp_Clean <- remove_outliers(rawdata5$ConvertedCompYearly)
rawdata6 <- rawdata5 %>%
filter(!is.na(Comp_Clean) ) %>%
select(-Comp_Clean)
rawdata6 %>%
ggplot(aes(x=Country, y=ConvertedCompYearly)) +
geom_boxplot(notch = T) +
stat_summary(fun=mean, geom="point", shape=20, size=5, color="red", fill="red") +
labs(title="Boxplot Compensation: US & Brazil have the highest income on average",
subtitle=paste0(" - After excluding below 15 or after 85 percentile",
"\n - Still have outliers") )
When further exploring the data by Slope Graph, it would reveal that average income of Poland/Brazil/India substantially dropped in 2021.
Density plots show a spike at low income in 2021 for these countries; the distribution is very different from previous years.
Besides, Poland & Brazil have extraordinarily fat tails in the 2018-2020 data, which is very different from other countries.
rawdata6 %>%
mutate(year=as.factor(year) ) %>%
group_by(year, Country) %>%
summarise(mean_comp=round( mean(ConvertedCompYearly) , digits=0) , NN=n() ) %>%
ungroup() %>%
mutate(year=as.ordered(year)) %>%
newggslopegraph(Time=year, Measurement=mean_comp,
Grouping=Country,
LineThickness=2, YTextSize=3,
LineColor=c("India"="orange", "Brazil"="green",
"Poland"="red", rep("XXX",7) ),
Title="Annual Compensation Mean by Country & Year",
SubTitle=" - Poland/Brazil/India substantially Dropped in 2021",
Caption=NULL
) rawdata6 %>%
filter(year %in% c(2018, 2019, 2020, 2021) ) %>%
mutate(year=as.factor(year)) %>%
ggplot(aes(x=ConvertedCompYearly, color=year, fill=year)) +
ggplot2::geom_density(alpha=0.2) +
facet_wrap(.~Country, ncol=5) +
labs(title="Density Plot of Annual Compensation by Country & Year",
subtitle=paste0(" - Year 2021 got spike high at low-income for Poland/Brazil/India",
"\n - Poland/Brazil fat-tailed at high-income") ) +
theme(axis.text.x = element_text(angle = 45),
text = element_text(family = "Courier") )As we could not confirm if these is the true data pattern or data quality issues, we could exclude year 2021 and Poland/Brazil data from our analysis.
rawdata6B <- rawdata6 %>%
filter(year!=2021 & ! Country %in% c("Poland", "Brazil")) %>%
mutate(Country = droplevels(Country) )
rawdata6B %>%
ggplot(aes(x=Country, y=ConvertedCompYearly)) +
geom_boxplot(notch = T) +
geom_hline(yintercept=mean(rawdata6B$ConvertedCompYearly),
color="blue", alpha=0.7) +
stat_summary(fun=mean, geom="point", shape=20, size=5, color="red", fill="red") +
labs(title="Boxplot Compensation: US above average",
subtitle=paste0(" - Other countries look similar") ) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90),
text = element_text(family = "Courier") )bc <- MASS::boxcox(rawdata6B$ConvertedCompYearly ~ rawdata6B$Country) lambda <- bc$x[which.max(bc$y)]
print(paste0( "By BoxCox Transformation, lambda:", round(lambda, digits=4) ) )## [1] "By BoxCox Transformation, lambda:-0.0606"
rawdata7 <- rawdata6B %>%
mutate(Comp_BC= (ConvertedCompYearly ^lambda-1)/ lambda )qqnorm(rawdata7$Comp_BC)
qqline(rawdata7$Comp_BC)Chart7a <- rawdata7 %>%
ggplot() +
ggplot2::geom_density(aes(x=ConvertedCompYearly), color="red", fill="red", alpha=0.3) +
xlab("Nominal Income") +
labs(title="Before Transformation") +
theme_linedraw()
Chart7b <- rawdata7 %>%
ggplot() +
ggplot2::geom_density(aes(x=Comp_BC), color="blue", fill="blue", alpha=0.3) +
xlab("BoxCox Transformed") +
labs(title="After Transformation") +
theme_linedraw()
gridExtra::grid.arrange(Chart7a, Chart7b, ncol = 2, top = "DENSITY PLOT")rawdata7 %>% lm(ConvertedCompYearly ~ as.factor(Country), .) %>% gvlma::gvlma()##
## Call:
## lm(formula = ConvertedCompYearly ~ as.factor(Country), data = .)
##
## Coefficients:
## (Intercept) as.factor(Country)United Kingdom
## 105218 -32618
## as.factor(Country)Australia as.factor(Country)India
## -22724 -22294
## as.factor(Country)Germany as.factor(Country)France
## -29432 -44792
## as.factor(Country)Canada as.factor(Country)Netherlands
## -31036 -27961
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma::gvlma(x = .)
##
## Value p-value Decision
## Global Stat 2599.416009052128175 0.00000 Assumptions NOT satisfied!
## Skewness 2591.596422837437331 0.00000 Assumptions NOT satisfied!
## Kurtosis 1.378066632815573 0.24043 Assumptions acceptable.
## Link Function 0.000000000001219 1.00000 Assumptions acceptable.
## Heteroscedasticity 6.441519581873688 0.01115 Assumptions NOT satisfied!
rawdata7 %>% lm(Comp_BC ~ as.factor(Country), .) %>% gvlma::gvlma()##
## Call:
## lm(formula = Comp_BC ~ as.factor(Country), data = .)
##
## Coefficients:
## (Intercept) as.factor(Country)United Kingdom
## 8.2849 -0.1914
## as.factor(Country)Australia as.factor(Country)India
## -0.1211 -0.1394
## as.factor(Country)Germany as.factor(Country)France
## -0.1605 -0.2780
## as.factor(Country)Canada as.factor(Country)Netherlands
## -0.1751 -0.1567
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma::gvlma(x = .)
##
## Value p-value Decision
## Global Stat 252.2678013143135445 0.0000 Assumptions NOT satisfied!
## Skewness 0.0658450918879243 0.7975 Assumptions acceptable.
## Kurtosis 251.9679907397861598 0.0000 Assumptions NOT satisfied!
## Link Function 0.0000000000002322 1.0000 Assumptions acceptable.
## Heteroscedasticity 0.2339654826392464 0.6286 Assumptions acceptable.
Kurtosis (heavy-tailed) assumption is not satisfied, maybe because data has been truncated by excluding income above USD1M.
Average income among countries is significantly different.
AOV_Result <- rawdata7 %>%
aov(Comp_BC ~ Country, .)
summary(AOV_Result)## Df Sum Sq Mean Sq F value Pr(>F)
## Country 7 360 51.43 1827 <0.0000000000000002 ***
## Residuals 42706 1202 0.03
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
broom::glance(AOV_Result)## # A tibble: 1 x 6
## logLik AIC BIC deviance nobs r.squared
## <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 15645. -31272. -31194. 1202. 42714 0.230
By pairwise comparison, average income of most countries are different from others.
pairwise_comparison <- TukeyHSD(AOV_Result, which="Country",
order=F, conf.level=.999) %>%
broom::tidy() %>%
arrange(desc(estimate) )
pairwise_comparison2 <- pairwise_comparison %>%
mutate(pairwise = as.factor(ifelse(adj.p.value>0.001, "C",
ifelse(estimate<0, "A", "B") ) )
) %>%
mutate(estimate2=abs(estimate),
conf.high2=ifelse(estimate<0,-1*conf.high,conf.high),
conf.low2=ifelse(estimate<0,-1*conf.low,conf.low)
)
pairwise_comparison2 %>%
ggplot(aes(x = reorder(contrast, estimate),
y = estimate2,
color = pairwise) ) +
geom_point(size = 2) +
geom_errorbar(aes(ymin = conf.low2, ymax = conf.high2),
width = 0.5) +
scale_color_manual(name = "Difference",
labels = c( "A" = "Negative",
"B" = "Positive",
"C" = "Insignificant (99.9% CI)"),
values = c( "A" = "tomato2",
"B" = "blue",
"C" = "darkgray")
) +
geom_hline(yintercept = 0, color = "red") +
xlab("Country Comparison") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45),
legend.position = "top") +
coord_flip() +
labs(title="Post Hoc Analysis - Tukey HSD",
subtitle=paste0(" - some country means are close",
"\n - e.g. Netherlands, Germany, India, Australia, Canada & UK") )To better understand the difference of developers’ earning among countries, we would further compare their net income & purchasing power. Net Income is calculated by factoring in the Tax Rate of the corresponding country. Then it is adjusted by Big Mac Index established by The Economist to reflect the Purchasing Power.
Personal Income Tax Rate is taken from Trading Economics. And the latest updated tax rates (2021) would be applied to 2018-2021 data.
Big Mac Index is taken from the GitHub of The Economist.
library(rvest)
# taxrate <-
# "https://tradingeconomics.com/country-list/personal-income-tax-rate" %>%
# read_html() %>%
# html_node(".table") %>%
# html_table()
# saveRDS(taxrate, file = "LEE_taxrate.rds")
taxrate <- readRDS("LEE_taxrate.rds")
taxrate <- taxrate %>%
select(Country, Last, Previous)
# bigmac <-
# "https://github.com/TheEconomist/big-mac-data/blob/6c79b1408de38941e7e17c47f7824179bc9e2e2e/output-data/big-mac-full-index.csv" %>%
# read_html() %>%
# html_node(".js-csv-data") %>%
# html_table()
# saveRDS(bigmac, file = "LEE_bigmac.rds")
bigmac <- readRDS("LEE_bigmac.rds")
bigmac <- bigmac %>%
select(-X1)
names(bigmac) <- bigmac[1,]
bigmac <- bigmac[-1,]
bigmac <- bigmac %>%
mutate(year = as.numeric(substr(date, 1, 4) ),
USD_raw=as.numeric(USD_raw) ) %>%
filter(year %in% c(2018, 2019, 2020, 2021) & substr(date,6,7)=="07") %>%
select(year, name, USD_raw)rawdata7A <- rawdata7 %>%
mutate(year=as.numeric(as.character(year)),
name=ifelse(Country=="United Kingdom", "Britain",
ifelse(Country %in% c("France", "Germany", "Netherlands"), "Euro area",
as.character(Country) ) )
) %>%
left_join(taxrate, by = c("Country") ) %>%
left_join(bigmac, by = c("year", "name") ) %>%
mutate(AfterTaxInc=ConvertedCompYearly * (1 - Last/100),
BigMacInc = AfterTaxInc / (1+USD_raw),
Country=forcats::fct_relevel( as_factor(Country), "United States") )After adjusting with Tax Rate & Big Mac Index, India became the top country with the highest average income (in terms of purchasing power).
Boxplot_OrigInc <-
rawdata7A %>%
ggplot(aes(x=Country, y=ConvertedCompYearly)) +
geom_boxplot(notch = T) +
geom_hline(yintercept=mean(rawdata6B$ConvertedCompYearly),
color="blue", alpha=0.7) +
scale_y_continuous(limits = c(20000, 200000) ) +
stat_summary(fun=mean, geom="point", shape=20, size=5, color="black", fill="black") +
scale_colour_manual(values = c("red",rep("black",7)) ) +
labs(title="Nominal Income" ) +
theme(legend.position = "none",
plot.title = element_text(size=10),
axis.title.y=element_blank(), axis.title.x=element_blank(),
axis.text.x = element_text(angle = 90),
text = element_text(family = "Courier") )
# Boxplot of after-tax income (AfterTaxInc) by country, with the overall mean
# after-tax income as a blue benchmark line and black dots for country means.
Boxplot_AfterTax <-
  rawdata7A %>%
  ggplot(aes(x = Country, y = AfterTaxInc)) +
  geom_boxplot(notch = TRUE) +
  geom_hline(yintercept = mean(rawdata7A$AfterTaxInc),
             color = "blue", alpha = 0.7) +
  scale_y_continuous(limits = c(20000, 200000)) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 5,
               color = "black", fill = "black") +
  # No colour aesthetic is mapped here, so this scale has no visible effect;
  # kept for symmetry with Boxplot_BigMac.
  scale_colour_manual(values = c("red", rep("black", 7))) +
  labs(title = "After Tax Income") +
  theme(legend.position = "none",
        plot.title = element_text(size = 10),
        axis.title.y = element_blank(), axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90),
        text = element_text(family = "Courier"))
# Boxplot of after-tax, Big-Mac-adjusted income (BigMacInc) by country.
# Here the colour aesthetic IS mapped; the manual scale paints the 4th factor
# level red (presumably India, per the surrounding narrative -- confirm
# against the factor level order) and the rest black.
Boxplot_BigMac <-
  rawdata7A %>%
  ggplot(aes(x = Country, y = BigMacInc, color = Country)) +
  geom_boxplot(notch = TRUE) +
  geom_hline(yintercept = mean(rawdata7A$BigMacInc),
             color = "blue", alpha = 0.7) +
  scale_y_continuous(limits = c(20000, 200000)) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 5) +
  scale_colour_manual(values = c(rep("black", 3), "red", rep("black", 4))) +
  labs(title = "After Tax & Big Mac Index") +
  theme(legend.position = "none",
        plot.title = element_text(size = 10),
        axis.title.y = element_blank(), axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 90),
        text = element_text(family = "Courier"))
# Arrange the three income boxplots side by side for comparison.
gridExtra::grid.arrange(Boxplot_OrigInc, Boxplot_AfterTax, Boxplot_BigMac,
ncol=3, left="Annual Income",
bottom="Country",
top="Income Looks Different After Tax & Adjusted with Big Mac Index")As shown in Part 1 that there is significant difference of income among countries, hence we will only focus on the country of United States which has the most respondents in this survey.
Because YearsCodingProf was recorded as a grouped categorical variable in the 2018 survey, we use only the 2019-2021 survey data for the Part 2 analysis.
# 2018 recorded YearsCodingProf as banded categories (frequency table below),
# so it cannot supply a numeric experience variable and 2018 is excluded
# from the Part 2 analysis.
table(data18$YearsCodingProf )##
## 0-2 years 12-14 years 15-17 years 18-20 years
## 23421 4287 3012 2830
## 21-23 years 24-26 years 27-29 years 3-5 years
## 1368 857 506 21362
## 30 or more years 6-8 years 9-11 years
## 1302 11385 7573
# US analysis base: 2019-2021 US respondents only.
# nCodeExp == 99 appears to be a sentinel for unusable experience values --
# TODO confirm against the upstream recoding.
# oCodeExp keeps the original (uncentered) years of coding experience;
# nCodeExp is then mean-centered for use in the regression models.
USbase <- rawdata6 %>%
  filter(year != 2018 & Country == "United States" & nCodeExp != 99) %>%
  mutate(oCodeExp = nCodeExp) %>%
  # Plain mutate() replaces the superseded mutate_at()/vars() form;
  # same centering, identical result for the single column involved.
  mutate(nCodeExp = nCodeExp - mean(nCodeExp, na.rm = TRUE))
# NOTE(review): the next line was garbled during extraction -- in the source
# the Box-Cox fit was separate code:
#   bc <- MASS::boxcox(USbase$ConvertedCompYearly ~ 1)
#   lambda_US <- bc$x[which.max(bc$y)]
# As printed here both statements are trapped inside the comment, so
# lambda_US would be undefined if this file were run verbatim.
# mean(USbase$oCodeExp)bc <- MASS::boxcox(USbase$ConvertedCompYearly ~ 1) lambda_US <- bc$x[which.max(bc$y)]
print(paste0( "By BoxCox Transformation, lambda:", round(lambda_US, digits=4) ) )## [1] "By BoxCox Transformation, lambda:0.5859"
# Box-Cox transform annual income with the estimated lambda (~0.586);
# Comp_US is the response used by all three US models below.
USbase <- USbase %>%
mutate(Comp_US= (ConvertedCompYearly ^lambda_US-1)/ lambda_US )It is observed from the below linear modeling output that:
# Model 1 (USlm1): regress Box-Cox income on coding experience, education
# and gender, each interacted with Python usage (fPython).
# NOTE(review): the first line below fuses narrative text with the start of
# the USlm1 chunk (extraction garbling).
Below College & College is indifferent, given that the 95% C.I. of coefficient estimate of Below College contains 0;Master & Doctor to income may be similar given that their coefficient estimates are very similar;Female may earn less;USlm1 <- USbase %>%
lm(Comp_US ~ (nCodeExp + xEdu + xSex) * fPython , .)
summary(USlm1)##
## Call:
## lm(formula = Comp_US ~ (nCodeExp + xEdu + xSex) * fPython, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1123.58 -187.99 -9.93 189.85 723.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1443.9735 14.0139 103.039 < 0.0000000000000002 ***
## nCodeExp 12.4601 0.2670 46.673 < 0.0000000000000002 ***
## xEdu2. College -11.4552 15.0161 -0.763 0.445555
## xEdu3. Degree 50.1827 14.3045 3.508 0.000452 ***
## xEdu4. Master 110.9655 15.2820 7.261 0.000000000000397 ***
## xEdu5. Doctor 99.1840 24.1411 4.109 0.000039968683609 ***
## xSexFemale -35.6747 7.9216 -4.503 0.000006720755139 ***
## xSexOthers -3.4947 10.6568 -0.328 0.742963
## fPython 65.2063 23.0058 2.834 0.004596 **
## nCodeExp:fPython 0.5181 0.4263 1.215 0.224305
## xEdu2. College:fPython -13.1711 24.7197 -0.533 0.594165
## xEdu3. Degree:fPython -29.5913 23.4319 -1.263 0.206651
## xEdu4. Master:fPython -33.6110 24.5523 -1.369 0.171028
## xEdu5. Doctor:fPython -12.6106 33.3084 -0.379 0.704987
## xSexFemale:fPython 3.0483 12.7931 0.238 0.811667
## xSexOthers:fPython 11.2847 16.4352 0.687 0.492331
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 258.4 on 21641 degrees of freedom
## Multiple R-squared: 0.1676, Adjusted R-squared: 0.167
## F-statistic: 290.4 on 15 and 21641 DF, p-value: < 0.00000000000000022
# Tidy coefficient table (ho) and one-row model-fit summary for Model 1.
ho <- USlm1 %>% broom::tidy()
broom::glance(USlm1)## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.168 0.167 258. 290. 0 15 -151016. 302066. 302202.
## deviance df.residual nobs
## <dbl> <int> <int>
## 1 1445013783. 21641 21657
Based on the observation in US Model 1, we group Below College & College together as Without Degree, and group Master & Doctor together as Postgraduate. And run the 2nd linear model with interaction between Education Level & Coding Experience.
# Collapse the five education levels into three groups:
#   BelowCollege/College -> "1. Without Degree"
#   Degree               -> "2. Undergraduate"
#   Master/Doctor        -> "3. Postgraduate"
edu_levels <- c("1. BelowCollege", "2. College", "3. Degree",
                "4. Master", "5. Doctor")
edu_groups <- c("1. Without Degree", "1. Without Degree",
                "2. Undergraduate",
                "3. Postgraduate", "3. Postgraduate")
USbase2 <- USbase %>%
  mutate(xEdu2 = factor(xEdu, levels = edu_levels, labels = edu_groups))
# Model 2 (USlm2): the collapsed education grouping interacts with coding
# experience; gender and Python usage enter as main effects only.
USlm2 <- USbase2 %>%
lm(Comp_US ~ nCodeExp * xEdu2 + xSex + fPython , .)
summary(USlm2)##
## Call:
## lm(formula = Comp_US ~ nCodeExp * xEdu2 + xSex + fPython, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1142.74 -187.84 -9.44 190.72 724.43
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1437.7990 4.3687 329.114
## nCodeExp 13.4079 0.4324 31.007
## xEdu22. Undergraduate 55.4926 4.6914 11.828
## xEdu23. Postgraduate 117.6905 5.8229 20.212
## xSexFemale -34.9788 6.2100 -5.633
## xSexOthers 2.4010 8.0993 0.296
## fPython 39.0994 3.5892 10.893
## nCodeExp:xEdu22. Undergraduate 0.2286 0.5119 0.447
## nCodeExp:xEdu23. Postgraduate -4.0361 0.6173 -6.539
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## nCodeExp < 0.0000000000000002 ***
## xEdu22. Undergraduate < 0.0000000000000002 ***
## xEdu23. Postgraduate < 0.0000000000000002 ***
## xSexFemale 0.0000000179645 ***
## xSexOthers 0.767
## fPython < 0.0000000000000002 ***
## nCodeExp:xEdu22. Undergraduate 0.655
## nCodeExp:xEdu23. Postgraduate 0.0000000000634 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 258 on 21648 degrees of freedom
## Multiple R-squared: 0.1699, Adjusted R-squared: 0.1696
## F-statistic: 554 on 8 and 21648 DF, p-value: < 0.00000000000000022
# Tidy coefficient table (ho) and one-row model-fit summary for Model 2.
ho <- USlm2 %>% broom::tidy()
broom::glance(USlm2)## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.170 0.170 258. 554. 0 8 -150985. 301990. 302070.
## deviance df.residual nobs
## <dbl> <int> <int>
## 1 1440888661. 21648 21657
In the 2nd linear model, note that Undergraduate has a similar interaction with Coding Experience as Without Degree, and that the Gender category Others may be indistinguishable from Male.
# Side-by-side coefficient (distribution) plot for Models 1 and 2;
# Model 1 uses classical standard errors, Model 2 HC3 robust ones.
jtools::plot_summs(USlm1,
USlm2,
scale = T,
robust = list(FALSE,
"HC3"),
model.names = c("Model 1 Interaction with Python",
"Model 2 Interaction with Edu Level"),
plot.distributions = T)## Loading required namespace: broom.mixed
# Prediction grid over coding experience x education group, holding Python
# usage at 0.5 and gender at the modal category ("Male"); Model 2
# predictions are attached as predicted_Y.
predicted_Y_values_tbl <- USbase2 %>% # input our dataset
modelr::data_grid(nCodeExp, # Explanatory Variable = X1
xEdu2, # Moderator = X2
fPython = 0.5, # take 0.5 for control
xSex = "Male" ) %>% # take mode
add_predictions(USlm2) %>%
rename(predicted_Y = pred)
# predicted_Y_values_tbl %>% print(n = nrow(.) )To undid the centering of Coding Experience (nCodeExp) and transforming of predicted income (predicted_Y). The interplay charts showing that Postgraduate may start with higher income, but their rate of increment is lower than Undergraduate.
# Undo the mean-centering of coding experience and invert the Box-Cox
# transform so predictions are back on the original income scale.
center_shift <- mean(USbase2$oCodeExp)
predicted_Y_values_tbl <- predicted_Y_values_tbl %>%
  mutate(
    nCodeExp   = nCodeExp + center_shift,
    oPredict_Y = (predicted_Y * lambda_US + 1)^(1 / lambda_US)
  )
# Faceted interplay chart: observed income points (coloured by education
# group) with Model 2's back-transformed prediction lines overlaid.
predicted_Y_values_tbl %>%
ggplot(aes(x = nCodeExp, y = oPredict_Y))+
geom_point(data=USbase2, aes(x=oCodeExp, y=ConvertedCompYearly, color=factor(xEdu2)) , alpha=0.1)+
geom_line(aes(y=oPredict_Y,color=factor(xEdu2)) ,size=2)+
labs(x = "Years of Professional Coding", y = "Annual Income",
color= "Edu Level") +
facet_wrap(~factor(xEdu2) ) +
theme(legend.position="none") +
labs(title="The Interplay of Education and Coding Experience on Annual Income",
subtitle=paste0("Postgrad got higher income at the early stage, ",
"but lower rate of increment than Undergrad") )Next step we will focus on comparing Postgraduate and Undergraduate, to see their income changing with years of professional coding experience.
As expected, the Gender category Others is indistinguishable from Male, and Postgraduate (fPostgrad) generally has a positive impact on income. However, the interaction term has a negative estimate, which means a Postgraduate's income increases with coding experience more slowly than an Undergraduate's.
# Restrict to degree holders and flag postgraduates:
# fPostgrad = 0 for "3. Degree", 1 for "4. Master" / "5. Doctor".
USbase3 <- USbase %>%
  filter(!(xEdu %in% c("1. BelowCollege", "2. College"))) %>%
  mutate(fPostgrad = as.numeric(xEdu != "3. Degree"))
# Model 3 (USlm3): coding experience interacted with the postgraduate flag,
# with gender and Python usage as controls.
USlm3 <- USbase3 %>%
lm(Comp_US ~ nCodeExp * fPostgrad + xSex + fPython , .)
summary(USlm3)##
## Call:
## lm(formula = Comp_US ~ nCodeExp * fPostgrad + xSex + fPython,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1135.7 -186.3 -10.9 187.3 685.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1494.6485 2.8659 521.525 < 0.0000000000000002 ***
## nCodeExp 13.6194 0.2734 49.818 < 0.0000000000000002 ***
## fPostgrad 62.5186 4.5961 13.602 < 0.0000000000000002 ***
## xSexFemale -30.6403 6.5558 -4.674 0.00000298 ***
## xSexOthers -1.7697 8.8863 -0.199 0.842
## fPython 35.3884 3.9056 9.061 < 0.0000000000000002 ***
## nCodeExp:fPostgrad -4.2553 0.5122 -8.309 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 254.3 on 17593 degrees of freedom
## Multiple R-squared: 0.1633, Adjusted R-squared: 0.163
## F-statistic: 572.2 on 6 and 17593 DF, p-value: < 0.00000000000000022
# Tidy coefficient table (ho) and one-row model-fit summary for Model 3.
ho <- USlm3 %>% broom::tidy()
broom::glance(USlm3)## # A tibble: 1 x 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.163 0.163 254. 572. 0 6 -122450. 244915. 244978.
## deviance df.residual nobs
## <dbl> <int> <int>
## 1 1137977561. 17593 17600
# Prediction grid for Model 3 over coding experience x postgraduate flag,
# holding Python usage at 0.5 and gender at the mode ("Male").
predicted_Y_values_tbl <- USbase3 %>% # input our dataset
modelr::data_grid(nCodeExp, # Explanatory Variable = X1
fPostgrad, # Moderator = X2
fPython = 0.5, # take 0.5 for control
xSex = "Male" ) %>% # take mode
add_predictions(USlm3) %>%
rename(predicted_Y = pred)
# predicted_Y_values_tbl %>% print(n = nrow(.) )
# We undid the centering of variable (Coding Experience).
# Also invert the Box-Cox transform so oPredict_Y is on the income scale.
# NOTE(review): the line closing the mutate() below was garbled during
# extraction -- it fuses with the start of interactions::interact_plot().
predicted_Y_values_tbl <- predicted_Y_values_tbl %>%
mutate(nCodeExp = nCodeExp + mean(USbase3$oCodeExp),
oPredict_Y = ( predicted_Y*lambda_US + 1 )^(1/lambda_US)
)interactions::interact_plot(USlm3, # plug in your model
pred = nCodeExp, # X1 = Explanatory variable
modx = fPostgrad, # X2 = Moderator
modx.labels = c("Undergrad", # = 0
"Postgrad"), # = 1
legend.main = "Edu Level",
interval = T, # display Confidence Intervals
int.width = 0.95,
colors = c("tomato3",
"deepskyblue3")
) +
# NOTE(review): the next line fuses theme() with the start of sim_slopes().
# Simple-slopes analysis: slope of nCodeExp at each level of fPostgrad.
theme(legend.position = "top")interactions::sim_slopes(USlm3,
pred = nCodeExp, # x-axis
modx = fPostgrad, # legend
johnson_neyman = F) # set J-N intervals at F## SIMPLE SLOPES ANALYSIS
##
## Slope of nCodeExp when fPostgrad = 0.00 (0):
##
## Est. S.E. t val. p
## ------- ------ -------- ------
## 13.62 0.27 49.82 0.00
##
## Slope of nCodeExp when fPostgrad = 1.00 (1):
##
## Est. S.E. t val. p
## ------ ------ -------- ------
## 9.36 0.44 21.46 0.00
# Johnson-Neyman analysis: range of nCodeExp where the fPostgrad effect is
# NOT significant at alpha = 0.05 (interval [11.40, 19.63] per the output).
interactions::sim_slopes(USlm3,
pred = fPostgrad,
modx = nCodeExp,
johnson_neyman = T) # set J-N intervals at T## JOHNSON-NEYMAN INTERVAL
##
## When nCodeExp is OUTSIDE the interval [11.40, 19.63], the slope of
## fPostgrad is p < .05.
##
## Note: The range of observed values of nCodeExp is [-9.69, 40.31]
##
## SIMPLE SLOPES ANALYSIS
##
## Slope of fPostgrad when nCodeExp = -8.7864122 (- 1 SD):
##
## Est. S.E. t val. p
## ------- ------ -------- ------
## 99.91 6.68 14.96 0.00
##
## Slope of fPostgrad when nCodeExp = -0.3913421 (Mean):
##
## Est. S.E. t val. p
## ------- ------ -------- ------
## 64.18 4.62 13.90 0.00
##
## Slope of fPostgrad when nCodeExp = 8.0037280 (+ 1 SD):
##
## Est. S.E. t val. p
## ------- ------ -------- ------
## 28.46 5.91 4.81 0.00
# Final annotated interaction plot for Model 3: prediction lines with 95%
# CIs, jittered data points, red vertical lines delimiting the Johnson-
# Neyman non-significance interval [11.40, 19.63], and shaded rectangles
# marking the regions of significance outside it.
# NOTE(review): geom_vline()'s `size` argument is deprecated in
# ggplot2 >= 3.4 in favour of `linewidth`.
# ```{r fig.align = "center", fig.width = 10, fig.height= 10, warning = F}
interactions::interact_plot(USlm3, # plug in your model
pred = nCodeExp, # X1 = Explanatory variable
modx = fPostgrad, # X2 = Moderator
legend.main = "Education Level",
modx.labels = c("Undergrad", # = 0
"Postgrad"), # = 1
interval = T, # display Confidence Intervals
int.width = 0.95,
colors = c("tomato3",
"deepskyblue3"),
plot.points = T, # REVEAL your data points
point.alpha = 0.1,
jitter = 0.2
) +
# Set benchmark for regions of significance: 11.40, 19.63
geom_vline(xintercept = 11.4, color = "red", size = 2) +
geom_vline(xintercept = 19.63, color = "red", size = 2) +
scale_y_continuous(limits = c(1200, 2100) ) +
annotate("rect",
fill = "yellow",
alpha = 0.1,
xmin = -10,
xmax = 11.4,
ymin = 1200,
ymax = 2100) +
annotate("rect",
fill = "yellow",
alpha = 0.1,
xmin = 19.63,
xmax = 40,
ymin = 1200,
ymax = 2100) +
annotate("text",
x = 30,
y = 1400,
label = "Regions of significance\nby Johnson-Neyman Analysis \nwith alpha at 5%") +
labs(title="Interplay of Education and Coding Experience on Annual Income",
subtitle=paste0("Postgraduate got higher income at the early stage, ",
"\nbut became lower than Undergraduate after 28.9 years of coding experience" ),
x = paste0("Years of Professional Coding (Centered, mean=",
round(mean(USbase3$oCodeExp), digits=2),")" ),
y = "Annual Compensation (BoxCox Transformed)") +
theme(legend.position = "top",
text = element_text(family = "Courier"),
axis.title = element_text()
)We ran three regression models for US respondents. The 1st model regressed coding experience, education level, gender and usage of Python, along with interaction terms with usage of Python, onto BoxCox Transformed Income (USlm1).
\[ \operatorname{Comp\_US} = \alpha + \beta_{1}(\operatorname{nCodeExp}) + \beta_{2}(\operatorname{xEdu}_{\operatorname{2.\ College}}) + \beta_{3}(\operatorname{xEdu}_{\operatorname{3.\ Degree}}) + \beta_{4}(\operatorname{xEdu}_{\operatorname{4.\ Master}}) + \beta_{5}(\operatorname{xEdu}_{\operatorname{5.\ Doctor}}) + \beta_{6}(\operatorname{xSex}_{\operatorname{Female}}) \\+ \beta_{7}(\operatorname{xSex}_{\operatorname{Others}}) + \beta_{8}(\operatorname{fPython}) + \beta_{9}(\operatorname{nCodeExp} \times \operatorname{fPython}) + \beta_{10}(\operatorname{xEdu}_{\operatorname{2.\ College}} \times \operatorname{fPython}) + \beta_{11}(\operatorname{xEdu}_{\operatorname{3.\ Degree}} \times \operatorname{fPython}) \\+ \beta_{12}(\operatorname{xEdu}_{\operatorname{4.\ Master}} \times \operatorname{fPython}) + \beta_{13}(\operatorname{xEdu}_{\operatorname{5.\ Doctor}} \times \operatorname{fPython}) + \beta_{14}(\operatorname{xSex}_{\operatorname{Female}} \times \operatorname{fPython}) + \beta_{15}(\operatorname{xSex}_{\operatorname{Others}} \times \operatorname{fPython}) + \epsilon \]
As the 1st model showed no interaction of usage of Python with other independent variables, We ran the 2nd model which regressed coding experience, education level, gender and usage of Python, along with interaction term between coding experience and education level, onto BoxCox Transformed Income (USlm2).
\[ \operatorname{Comp\_US} = \alpha + \beta_{1}(\operatorname{nCodeExp}) + \beta_{2}(\operatorname{xEdu2}_{\operatorname{2.\ Undergraduate}}) + \beta_{3}(\operatorname{xEdu2}_{\operatorname{3.\ Postgraduate}}) + \beta_{4}(\operatorname{xSex}_{\operatorname{Female}}) + \beta_{5}(\operatorname{xSex}_{\operatorname{Others}}) \\+ \beta_{6}(\operatorname{fPython}) + \beta_{7}(\operatorname{nCodeExp} \times \operatorname{xEdu2}_{\operatorname{2.\ Undergraduate}}) + \beta_{8}(\operatorname{nCodeExp} \times \operatorname{xEdu2}_{\operatorname{3.\ Postgraduate}}) + \epsilon \]
The last model is similar to the 2nd, but we focus on comparing Postgraduate and Undergraduate, to see their income changing with years of professional coding experience. The 3rd model regressed coding experience, gender and usage of Python, along with interaction term between coding experience and postgrad indicator, onto BoxCox Transformed Income (USlm3).
\[ \operatorname{Comp\_US} = \alpha + \beta_{1}(\operatorname{nCodeExp}) + \beta_{2}(\operatorname{fPostgrad}) + \beta_{3}(\operatorname{xSex}_{\operatorname{Female}}) + \beta_{4}(\operatorname{xSex}_{\operatorname{Others}}) \\+ \beta_{5}(\operatorname{fPython}) + \beta_{6}(\operatorname{nCodeExp} \times \operatorname{fPostgrad}) + \epsilon \]
# Presentation tables: render tidy model summaries with knitr/kableExtra.
library(knitr)
library(kableExtra) ##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Model 1 coefficient table; rows whose p-values exceed 0.05 in the printed
# output are greyed out.
broom::tidy(USlm1)%>%
kbl(caption = "US Model 1") %>%
kable_styling("striped", full_width = T, fixed_thead = T, font_size = 12) %>%
column_spec(c(1, 5), bold = T) %>%
row_spec(c(3, 8, 10,11,12,13,14,15,16), bold = F, color = "darkgray")| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1443.9734970 | 14.0139153 | 103.0385491 | 0.0000000 |
| nCodeExp | 12.4600696 | 0.2669671 | 46.6726850 | 0.0000000 |
| xEdu2. College | -11.4552131 | 15.0161488 | -0.7628596 | 0.4455555 |
| xEdu3. Degree | 50.1826753 | 14.3044855 | 3.5081776 | 0.0004521 |
| xEdu4. Master | 110.9655227 | 15.2819523 | 7.2612138 | 0.0000000 |
| xEdu5. Doctor | 99.1840375 | 24.1411277 | 4.1085089 | 0.0000400 |
| xSexFemale | -35.6746869 | 7.9216310 | -4.5034523 | 0.0000067 |
| xSexOthers | -3.4947368 | 10.6567722 | -0.3279358 | 0.7429634 |
| fPython | 65.2063304 | 23.0058058 | 2.8343424 | 0.0045963 |
| nCodeExp:fPython | 0.5180892 | 0.4263423 | 1.2151955 | 0.2243048 |
| xEdu2. College:fPython | -13.1710921 | 24.7197064 | -0.5328175 | 0.5941654 |
| xEdu3. Degree:fPython | -29.5913374 | 23.4318938 | -1.2628658 | 0.2066510 |
| xEdu4. Master:fPython | -33.6110053 | 24.5523240 | -1.3689541 | 0.1710278 |
| xEdu5. Doctor:fPython | -12.6106231 | 33.3083649 | -0.3786023 | 0.7049869 |
| xSexFemale:fPython | 3.0483107 | 12.7930664 | 0.2382783 | 0.8116675 |
| xSexOthers:fPython | 11.2847142 | 16.4352276 | 0.6866175 | 0.4923312 |
# Model 1 fit statistics, with alternating columns highlighted.
kable(broom::glance(USlm1))%>%
kable_styling("striped", full_width = T, font_size = 12) %>%
column_spec(c(2, 4, 6, 8, 10, 12), bold = T, color = "white", background = "#ff6347")| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.1675688 | 0.1669918 | 258.4029 | 290.4228 | 0 | 15 | -151016.2 | 302066.4 | 302202.1 | 1445013783 | 21641 | 21657 |
# Model 2 coefficient table; rows whose p-values exceed 0.05 in the printed
# output are greyed out.
broom::tidy(USlm2)%>%
kbl(caption = "US Model 2") %>%
kable_styling("striped", full_width = T, fixed_thead = T, font_size = 12) %>%
column_spec(c(1, 5), bold = T) %>%
row_spec(c(6, 8), bold = F, color = "darkgray")| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1437.799039 | 4.3686993 | 329.1137579 | 0.0000000 |
| nCodeExp | 13.407912 | 0.4324132 | 31.0071754 | 0.0000000 |
| xEdu22. Undergraduate | 55.492579 | 4.6914335 | 11.8284910 | 0.0000000 |
| xEdu23. Postgraduate | 117.690532 | 5.8228831 | 20.2117285 | 0.0000000 |
| xSexFemale | -34.978754 | 6.2099793 | -5.6326683 | 0.0000000 |
| xSexOthers | 2.400981 | 8.0992819 | 0.2964437 | 0.7668941 |
| fPython | 39.099434 | 3.5892462 | 10.8934945 | 0.0000000 |
| nCodeExp:xEdu22. Undergraduate | 0.228564 | 0.5118888 | 0.4465111 | 0.6552325 |
| nCodeExp:xEdu23. Postgraduate | -4.036089 | 0.6172565 | -6.5387547 | 0.0000000 |
# Model 2 fit statistics, with alternating columns highlighted.
kable(broom::glance(USlm2))%>%
kable_styling("striped", full_width = T, font_size = 12) %>%
column_spec(c(2, 4, 6, 8, 10, 12), bold = T, color = "white", background = "#ff6347")| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.1699451 | 0.1696384 | 257.992 | 554.0254 | 0 | 8 | -150985.2 | 301990.5 | 302070.3 | 1440888661 | 21648 | 21657 |
# Model 3 coefficient table; the one row whose p-value exceeds 0.05 in the
# printed output is greyed out.
broom::tidy(USlm3)%>%
kbl(caption = "US Model 3") %>%
kable_styling("striped", full_width = T, fixed_thead = T, font_size = 12) %>%
column_spec(c(1, 5), bold = T) %>%
row_spec(c(5), bold = F, color = "darkgray")| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1494.648523 | 2.8659174 | 521.5253347 | 0.000000 |
| nCodeExp | 13.619403 | 0.2733816 | 49.8182970 | 0.000000 |
| fPostgrad | 62.518650 | 4.5961217 | 13.6024792 | 0.000000 |
| xSexFemale | -30.640344 | 6.5558317 | -4.6737539 | 0.000003 |
| xSexOthers | -1.769707 | 8.8862643 | -0.1991509 | 0.842147 |
| fPython | 35.388359 | 3.9056070 | 9.0609115 | 0.000000 |
| nCodeExp:fPostgrad | -4.255334 | 0.5121597 | -8.3086082 | 0.000000 |
# Model 3 fit statistics, with alternating columns highlighted.
kable(broom::glance(USlm3))%>%
kable_styling("striped", full_width = T, font_size = 12) %>%
column_spec(c(2, 4, 6, 8, 10, 12), bold = T, color = "white", background = "#ff6347")| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.163277 | 0.1629916 | 254.3296 | 572.1788 | 0 | 6 | -122449.7 | 244915.4 | 244977.7 | 1137977561 | 17593 | 17600 |
From our data science project, we could find the following two findings:
Nominal incomes of developers differ significantly among countries. Based on the survey data, and out of the countries under study, the US has the highest while France has the lowest developer income. After considering personal income tax rates, the ranking is still similar: the US has the highest while France has the lowest after-tax income for developers; note that Germany and the Netherlands used to have slightly higher nominal incomes than Canada, but their after-tax incomes fall below Canada's. Factoring in the Big Mac Index to reflect purchasing power, India would be the highest — far above the next highest country (the US).
Regarding the interplay of education level and coding experience, when restricting the analysis to US respondents, we observe:
The impacts of Below College and College are indistinguishable, and the impacts of Master and Doctor on income appear similar.
It is sad to say this but shown by US Model 1 (USlm1) that on average Female may earn less, and developers using Python may earn more (while this project is conducted by R).
As expected, US Model 3 (USlm3) shows that coding experience has a positive impact on income and that postgraduates fare better than undergraduates. Interestingly, however, the interaction term between postgraduate status and coding experience has a negative estimate: income grows with coding experience more slowly for postgraduates than for undergraduates. Even though postgraduates start at a higher income level, undergraduates may catch up after about 28.9 years of professional coding experience.
In terms of purchasing power, developers working in India earn much more than those in other countries, but the second-ranked country (the US) may be a better option for developers who want to migrate, as the US on average has the highest after-tax income; income is not just for paying living expenses — it can also be used to exploit investment opportunities around the world.
US Model 3 (USlm3) showed that in the long run (after 28.9 years) postgraduate developers may earn less than undergraduates; but this conclusion came from observing different subjects with different years of coding experience at specific snapshots in 2019-2021 (a cross-sectional study), rather than from repeatedly measuring the same pool of subjects over time. Academic major may play a more critical role: what a postgraduate learnt 30 years ago might not be very relevant to the challenges faced by developers or data scientists today, so we may see postgraduates with less experience earning more, given that they have just started their careers and what they learnt should be the most up-to-date and best fitted to job-market demands. Moreover, we must be cautious when investigating the causal relationship: it could be that developers with slower career progress pursued further studies to enhance their market value, and that this proved ineffective. It could also be due to other confounders not considered in the model — for instance, postgraduates took extra time to obtain their Master/Doctor degrees and started their careers later, so even with the same 30 years of coding experience, a postgraduate would be older and might have fewer opportunities than an undergraduate developer. It is interesting that the data reveals something different from the layman expectation, but further study is required to understand why.
There is no comprehensive adjustment indicator that truly reflects purchasing power. While the ingredients and standards of a Big Mac may be similar across countries, consumption of Big Macs may occupy only a small portion of personal income, and market demand may distort its price; for instance, as a number of Indians are vegetarian or do not eat beef, the Big Mac may not be a common good targeting the general public in India. Therefore, unlike in other countries, the Big Mac Index may not be a proper measure of Indian purchasing power. For developed countries and metropolises, a significant portion of personal income is generally spent on housing/rent, so housing prices and rents could be considered to adjust income — though property prices can sometimes be over-valued due to speculation.
By right India would have more lower-income developers, but many of them would have been removed as outliers, because we identified outliers based on the respondents of the Top 10 countries, many of whom are from Western countries. In the future we may consider removing outliers country by country.
For simplicity, Tax Rates updated in 2021 are applied to the survey data collected in 2018 - 2020.
Conclusion of the review is only based on the survey results, which might not be used to inference the population given that number of respondents was relatively small and a number of them did not provide income information in survey.
# Record the R version and package versions used for reproducibility.
sessionInfo()## R version 4.1.0 (2021-05-18)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Hong Kong SAR.1252
## [2] LC_CTYPE=English_Hong Kong SAR.1252
## [3] LC_MONETARY=English_Hong Kong SAR.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_Hong Kong SAR.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.3.4 knitr_1.33 rvest_1.0.1
## [4] equatiomatic_0.3.0.9000 CGPfunctions_0.6.3 plotly_4.10.0
## [7] broom_0.7.9 modelr_0.1.8 lubridate_1.7.10
## [10] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7
## [13] purrr_0.3.4 readr_2.0.1 tidyr_1.1.3
## [16] tibble_3.1.4 ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1 jtools_2.1.4
## [4] systemfonts_1.0.3 lazyeval_0.2.2 splines_4.1.0
## [7] crosstalk_1.1.1 digest_0.6.27 interactions_1.1.5
## [10] htmltools_0.5.2 fansi_0.5.0 magrittr_2.0.1
## [13] paletteer_1.4.0 tzdb_0.1.2 svglite_2.0.0
## [16] sandwich_3.0-1 colorspace_2.0-2 ggrepel_0.9.1
## [19] haven_2.4.3 xfun_0.25 crayon_1.4.1
## [22] jsonlite_1.7.2 libcoin_1.0-9 Exact_3.0
## [25] lme4_1.1-27.1 survival_3.2-11 zoo_1.8-9
## [28] glue_1.4.2 gtable_0.3.0 emmeans_1.7.0
## [31] webshot_0.5.2 MatrixModels_0.5-0 sjstats_0.18.1
## [34] sjmisc_2.8.7 scales_1.1.1 mvtnorm_1.1-2
## [37] DBI_1.1.1 Rcpp_1.0.7 viridisLite_0.4.0
## [40] xtable_1.8-4 performance_0.8.0 ggstance_0.3.5
## [43] proxy_0.4-26 Formula_1.2-4 datawizard_0.2.1
## [46] htmlwidgets_1.5.4 httr_1.4.2 ellipsis_0.3.2
## [49] pkgconfig_2.0.3 farver_2.1.0 sass_0.4.0
## [52] dbplyr_2.1.1 utf8_1.2.2 tidyselect_1.1.1
## [55] labeling_0.4.2 rlang_0.4.11 later_1.3.0
## [58] effectsize_0.5 munsell_0.5.0 cellranger_1.1.0
## [61] tools_4.1.0 cli_3.0.1 generics_0.1.1
## [64] pacman_0.5.1 sjlabelled_1.1.8 evaluate_0.14
## [67] fastmap_1.1.0 yaml_2.2.1 rematch2_2.1.2
## [70] fs_1.5.0 pander_0.6.4 ggmosaic_0.3.3
## [73] rootSolve_1.8.2.3 pbapply_1.5-0 nlme_3.1-152
## [76] mime_0.12 xml2_1.3.2 compiler_4.1.0
## [79] rstudioapi_0.13 e1071_1.7-9 reprex_2.0.1
## [82] bslib_0.3.1 DescTools_0.99.43 stringi_1.7.4
## [85] highr_0.9 parameters_0.15.0 lattice_0.20-44
## [88] Matrix_1.3-3 nloptr_1.2.2.2 vctrs_0.3.8
## [91] pillar_1.6.4 lifecycle_1.0.1 jquerylib_0.1.4
## [94] estimability_1.3 data.table_1.14.0 insight_0.14.5
## [97] lmom_2.8 httpuv_1.6.2 R6_2.5.1
## [100] promises_1.2.0.1 gridExtra_2.3 BayesFactor_0.9.12-4.2
## [103] gld_2.6.2 boot_1.3-28 MASS_7.3-54
## [106] gtools_3.9.2 assertthat_0.2.1 withr_2.4.2
## [109] broom.mixed_0.2.7 bayestestR_0.11.0 expm_0.999-6
## [112] parallel_4.1.0 hms_1.1.0 grid_4.1.0
## [115] rpart_4.1-15 coda_0.19-4 class_7.3-19
## [118] minqa_1.2.4 rmarkdown_2.10 inum_1.0-4
## [121] partykit_1.2-15 gvlma_1.0.0.3 shiny_1.7.1
# Restore the warning level saved into defaultW at the top of the script.
options(warn = defaultW)